R is ‘GNU S’, a freely available language and environment for statistical computing and graphics which provides a wide variety of statistical and graphical techniques
Download and install R (precompiled binary distribution) from UCLA http://cran.stat.ucla.edu, the nearest R mirror site http://cran.r-project.org/mirrors.html
RStudio is a set of integrated development environment (IDE) designed to help you be more prudictive with R. It includes
Download and install RStudio from https://www.rstudio.com/products/rstudio/download
1 + 2
## [1] 3
2^4
## [1] 16
log(16, 2)
## [1] 4
log(1000, 3)
## [1] 6.28771
choose(5, 2)
## [1] 10
relapse <- TRUE
typeof( relapse )
## [1] "logical"
relapse
## [1] TRUE
sbp <- c(160, 110, 105, 150, 170)
sbp > 120
## [1] TRUE FALSE FALSE TRUE TRUE
diag <- ifelse( sbp > 120, "hypertension", "normal")
ldl <- 100
typeof( ldl )
## [1] "double"
hdl <- 60
vldl <- 25
totalChol <- hdl + ldl + vldl
totalChol
## [1] 185
cholesterol <- c(177, 193, 195, 209, 226)
typeof( cholesterol )
## [1] "double"
length( cholesterol )
## [1] 5
mean( cholesterol )
## [1] 200
max( cholesterol )
## [1] 226
genotype <- c("AA", "AG", "GG")
typeof( genotype )
## [1] "character"
alleles <- c( "AG", "GG", "AA", "AA", "AA", "AA", "AA", "AG", "AG", "GG", "GG", "GG", "GG", "AA")
typeof( alleles )
## [1] "character"
alleles
## [1] "AG" "GG" "AA" "AA" "AA" "AA" "AA" "AG" "AG" "GG" "GG" "GG" "GG" "AA"
alleles <- factor( alleles )
levels( alleles )
## [1] "AA" "AG" "GG"
typeof( alleles )
## [1] "integer"
alleles
## [1] AG GG AA AA AA AA AA AG AG GG GG GG GG AA
## Levels: AA AG GG
table( alleles )
## alleles
## AA AG GG
## 6 3 5
genotype <- c("AA", "AA", "AA", "AA", "AA", "GG", "GG", "GG", "GG")
mrna <- c(100, 90, 105, 87, 92, 20, 24, 35, 27)
ethnicity <- c("AFR", "AFR", "AFR", "AMR", "AFR", "ASN", "EUR", "EUR", "EUR")
relapse <- c("yes", "yes", "yes", "no", "yes", "no", "no", "no", "no")
study <- data.frame( genotype, mrna, ethnicity, relapse)
dim( study )
## [1] 9 4
head( study )
## genotype mrna ethnicity relapse
## 1 AA 100 AFR yes
## 2 AA 90 AFR yes
## 3 AA 105 AFR yes
## 4 AA 87 AMR no
## 5 AA 92 AFR yes
## 6 GG 20 ASN no
# Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.
# They describe characteristics of the cell nuclei present in the image.
url <-"http://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
pima <- read.table( url, sep=",")
colnames( pima ) <- c("pregnancy", "gtt", "dbp", "skinthickness", "insulin", "bmi", "familyhistory", "age", "diabetes")
head( pima )
## pregnancy gtt dbp skinthickness insulin bmi familyhistory age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 1
## 2 1 85 66 29 0 26.6 0.351 31 0
## 3 8 183 64 0 0 23.3 0.672 32 1
## 4 1 89 66 23 94 28.1 0.167 21 0
## 5 0 137 40 35 168 43.1 2.288 33 1
## 6 5 116 74 0 0 25.6 0.201 30 0
dim( pima )
## [1] 768 9
table( pima$diabetes )
##
## 0 1
## 500 268
boxplot( gtt ~ diabetes,
data = pima)
boxplot( gtt ~ diabetes,
data = pima,
boxwex = 0.3,
main = "Pima Indians Diabetes Study",
ylab = "Glucose concentration",
xlab = "Diabetes")
case <- pima$gtt[ pima$diabetes == 1]
control <- pima$gtt[ pima$diabetes == 0]
t.test( case, control )
##
## Welch Two Sample t-test
##
## data: case and control
## t = 13.752, df = 461.33, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 26.80786 35.74707
## sample estimates:
## mean of x mean of y
## 141.2575 109.9800
case <- pima$dbp[ pima$diabetes == 1]
control <- pima$dbp[ pima$diabetes == 0]
t.test( case, control )
##
## Welch Two Sample t-test
##
## data: case and control
## t = 1.7131, df = 471.31, p-value = 0.08735
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.388326 5.669580
## sample estimates:
## mean of x mean of y
## 70.82463 68.18400
boxplot( dbp ~ diabetes,
data = pima,
boxwex = 0.3,
main = "Pima Indians Diabetes Study",
ylab = "Diastolic blood pressure",
xlab = "Diabetes")
m1 <- glm( diabetes ~ pregnancy + gtt + dbp + bmi + familyhistory,
data=pima,
family = "binomial")
round( summary( m1 )$coef, 3)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.955 0.676 -11.771 0.000
## pregnancy 0.153 0.028 5.514 0.000
## gtt 0.035 0.003 10.213 0.000
## dbp -0.012 0.005 -2.387 0.017
## bmi 0.085 0.014 6.006 0.000
## familyhistory 0.911 0.294 3.097 0.002
library("forestmodel")
## Loading required package: ggplot2
forest_model( m1 )
# INSTALL REQUIRED R PACKAGES
#source("http://bioconductor.org/biocLite.R"); biocLite("SNPRelate");
# LOAD R PACKAGES
library(SNPRelate)
## Loading required package: gdsfmt
## SNPRelate -- supported by Streaming SIMD Extensions 2 (SSE2)
# PREPARE INPUT DATA
genofile = openfn.gds(snpgdsExampleFileName())
# PRUNE SNPs BASED ON LINKAGE DISEQUILIBRIUM (LD)
snpset = snpgdsLDpruning(genofile, ld.threshold=0.2); snpset.id = unlist(snpset)
## Hint: it is suggested to call `snpgdsOpen' to open a SNP GDS file instead of `openfn.gds'.
## SNP pruning based on LD:
## Excluding 365 SNPs on non-autosomes
## Excluding 1 SNP (monomorphic: TRUE, < MAF: NaN, or > missing rate: NaN)
## Working space: 279 samples, 8,722 SNPs
## using 1 (CPU) core
## Sliding window: 500000 basepairs, Inf SNPs
## |LD| threshold: 0.2
## Chromosome 1: 75.84%, 543/716
## Chromosome 2: 73.05%, 542/742
## Chromosome 3: 74.38%, 453/609
## Chromosome 4: 73.67%, 414/562
## Chromosome 5: 76.68%, 434/566
## Chromosome 6: 74.87%, 423/565
## Chromosome 7: 75.42%, 356/472
## Chromosome 8: 71.52%, 349/488
## Chromosome 9: 77.64%, 323/416
## Chromosome 10: 74.12%, 358/483
## Chromosome 11: 77.63%, 347/447
## Chromosome 12: 76.58%, 327/427
## Chromosome 13: 76.45%, 263/344
## Chromosome 14: 76.60%, 216/282
## Chromosome 15: 76.34%, 200/262
## Chromosome 16: 72.30%, 201/278
## Chromosome 17: 73.91%, 153/207
## Chromosome 18: 73.68%, 196/266
## Chromosome 19: 83.33%, 100/120
## Chromosome 20: 71.62%, 164/229
## Chromosome 21: 76.19%, 96/126
## Chromosome 22: 77.59%, 90/116
## 6548 SNPs are selected in total.
# PEFORM PCA
pca = snpgdsPCA(genofile,
maf=0.05,
missing.rate=0.05,
snp.id=snpset.id,
num.thread=2)
## Hint: it is suggested to call `snpgdsOpen' to open a SNP GDS file instead of `openfn.gds'.
## Principal Component Analysis (PCA) on genotypes:
## Excluding 2,540 SNPs (non-autosomes or non-selection)
## Excluding 1,105 SNPs (monomorphic: TRUE, < MAF: 0.05, or > missing rate: 0.05)
## Working space: 279 samples, 5,443 SNPs
## using 2 (CPU) cores
## PCA: the sum of all selected genotypes (0, 1 and 2) = 1526608
## Wed Jan 11 15:17:48 2017 (internal increment: 2684)
##
[..................................................] 0%, ETC: ---
[==================================================] 100%, completed
## Wed Jan 11 15:17:48 2017 Begin (eigenvalues and eigenvectors)
## Wed Jan 11 15:17:48 2017 Done.
# DRAW A PLOT
plot(pca$eigenvect[,2], pca$eigenvect[,1],
xlab="Principal Component 2",
ylab="Principal Component 1",
type="n")
race = as.factor(read.gdsn( index.gdsn(genofile, index=c("sample.annot", "pop.group"))))
table( race )
## race
## CEU HCB JPT YRI
## 92 47 47 93
points(pca$eigenvect[,2], pca$eigenvect[,1], col=race)
legend("bottomleft",
title="Population",
legend=levels(race),
text.col=1:nlevels(race))
library(manhattanly)
## See example usage at http://sahirbhatnagar.com/manhattanly/
dim( HapMap )
## [1] 14412 8
head( HapMap )
## CHR BP P SNP ZSCORE EFFECTSIZE GENE DISTANCE
## 1 1 937641 0.3353438 rs9697358 0.9634 -0.0946 ISG15 1068
## 2 1 1136887 0.2458571 rs34945898 1.1605 -0.0947 TNFRSF4 0
## 3 1 2116240 0.8232859 rs12034613 0.2233 -0.0741 FP7162 0
## 4 1 2310562 0.4932038 rs4648633 0.6852 0.0146 MORN1 0
## 5 1 2681715 0.6053916 rs4430271 0.5167 0.1234 MMEL1 127427
## 6 1 2917484 0.1944431 rs6685625 1.2975 0.1979 ACTRT2 10421
HapMap[ HapMap$P < 10^(-8), ]
## CHR BP P SNP ZSCORE EFFECTSIZE GENE
## 4338 5 28801403 4.70701e-09 rs16898108 5.8572 2.0314 LOC729862
## 4339 5 29102367 9.99901e-09 rs610094 5.7307 1.2378 AK098570
## 4340 5 29214921 6.70501e-09 rs16898772 5.7982 0.3107 AK098570
## 4341 5 29691322 6.22801e-09 rs16899251 5.8105 -0.8520 AK098570
## 4342 5 29775399 5.02201e-09 rs6450718 5.8464 0.4985 AK098570
## 4343 5 29950601 9.86401e-09 rs2972799 5.7331 -0.5233 AK098570
## 4344 5 30019370 3.95101e-09 rs2195387 5.8862 0.6305 AK098570
## 4345 5 30064404 8.01901e-09 rs16899785 5.7681 -0.7904 AK098570
## 4346 5 30262253 6.75010e-10 rs16900007 6.1718 1.6960 CDH6
## 4347 5 30797178 3.41101e-09 rs1039325 5.9105 1.3266 CDH6
## DISTANCE
## 4338 161330
## 4339 77057
## 4340 6700
## 4341 483101
## 4342 567178
## 4343 742380
## 4344 811149
## 4345 856183
## 4346 967299
## 4347 432374
manhattanly(HapMap,
snp = "SNP",
gene = "GENE",
annotation1 = "ZSCORE", annotation2 = "EFFECTSIZE",
highlight = significantSNP)